library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally) ; library(ggpubr) ; library(ggExtra)
library(WGCNA)
library(expss)
library(polycor)
library(biomaRt)
library(clusterProfiler) # ORA
library(foreach) ; library(doParallel)

SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}

Load preprocessed dataset (preprocessing code in01_data_preprocessing.Rmd) and clustering (pipeline in 05_WGCNA.Rmd)

# Gandal dataset
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
rownames(datExpr) = datGenes$ensembl_gene_id
DE_info = DE_info %>% data.frame
datMeta = datMeta %>% mutate(ID = title)


# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal'=1)


# SFARI Genes
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]


# Clusterings
clusterings = read_csv('./../Data/clusters.csv')


# Update DE_info with SFARI and Neuronal information
genes_info = DE_info %>% mutate('entrezgene'=rownames(.) %>% as.numeric) %>% 
             dplyr::rename('padj' = adj.P.Val, 'log2FoldChange' = logFC) %>%
             left_join(datGenes %>% dplyr::select(entrezgene, ensembl_gene_id) %>% 
                       dplyr::rename('ID' = ensembl_gene_id), by = 'entrezgene') %>% 
             left_join(SFARI_genes, by='ID') %>% 
             mutate(`gene-score`=ifelse(is.na(`gene-score`), 'Others', `gene-score`)) %>%
             distinct(ID, .keep_all = TRUE) %>% left_join(GO_neuronal, by='ID') %>%
             mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
             mutate(gene.score=ifelse(`gene-score`=='Others' & Neuronal==1, 'Neuronal', `gene-score`), 
                    significant=padj<0.05 & !is.na(padj)) %>%
             mutate(Group = factor(ifelse(gene.score %in% c('Neuronal','Others'), gene.score, 'SFARI'), 
                    levels = c('SFARI', 'Neuronal', 'Others'))) %>% left_join(clusterings, by='ID')


rm(DE_info, GO_annotations, clusterings, efit)

Dynamic Tree vs Dyamic Hybrid


Comparing module sizes


Dynamic Tree creates less and more balanced modules (opposite to Gandal’s modules)

plot_data = table(genes_info$DynamicTree) %>% data.frame %>% arrange(desc(Freq)) %>% filter(Var1 != 'gray')

p1 = plot_data %>% ggplot(aes(x=reorder(Var1, -Freq), y=Freq)) + 
     geom_bar(stat='identity', fill=plot_data$Var1) + 
     geom_hline(yintercept = mean(plot_data$Freq), color = 'gray',linetype = 'dashed') +
     ggtitle('Modules using Dynamic Tree') + ylab('Number of genes') + xlab('Module') + 
     theme_minimal() + theme(axis.text.x = element_blank())

plot_data = table(genes_info$DynamicHybrid) %>% data.frame %>% arrange(desc(Freq)) %>% filter(Var1 != 'gray')

p2 = plot_data %>% ggplot(aes(x=reorder(Var1, -Freq), y=Freq)) + 
     geom_bar(stat='identity', fill=plot_data$Var1) + 
     geom_hline(yintercept = mean(plot_data$Freq), color = 'gray',linetype = 'dashed') +
     ggtitle('Modules using Dynamic Hybrid') + ylab('Number of genes') + xlab('Module') + 
     theme_minimal() + theme(axis.text.x = element_blank())

grid.arrange(p1,p2, nrow=2, top = 'Comparison of Module sizes')

rm(p1, p2, plot_data)


Comparing genes left without a module


Dynamic Tree leaves 12883 genes without cluster (84%) Dynamic Hybrid leaves 2188 genes without cluster (14%)

Dynamic Tree leaves many more genes without a cluster (too many). As we saw in the dendrograms in 05_WGCNA.html, the genes usually left without a cluster are the ones closest to the root of the dendrogram, now we want to see if they share some characteristic in common

  • The group of genes with low level of expression and large values in the 2nd principal component seem to be left without a cluster more often than the rest of the genes. We had seen that this genes were noisy in the preprocessing pipeline, so it’s not so bad that we are losing them

  • Genes assigned to a cluster also seem to have a higher level of expression than the genes left unassigned (becaues they have higher PC1 values)

pca = datExpr %>% prcomp

plot_data = data.frame('ID'=rownames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>%
            left_join(genes_info, by='ID') %>% mutate('hasCluster' = DynamicTree!='gray', 
                                                      'hasSFARIScore' = `gene-score`!='Others') %>%
            apply_labels(`gene-score` = 'SFARI Gene score', DynamicTree = 'Dynamic Tree Algorithm', 
                         significant = 'Differentially Expressed', hasCluster = 'Belongs to a Module',
                         hasSFARIScore = 'Has a SFARI Score', syndromic = 'Has syndromic tag') %>%
            mutate(alpha_cluster = ifelse(hasCluster==TRUE, 0.8, 0.1),
                   alpha_signif = ifelse(significant==TRUE, 0.8, 0.1))

p = plot_data %>% ggplot(aes(PC1, PC2, color=hasCluster)) + geom_point(alpha = plot_data$alpha_cluster) + 
    theme_minimal() + ggtitle('Genes are assigned to a cluster') + theme(legend.position='bottom')

ggExtra::ggMarginal(p, type = 'density', groupColour = TRUE, size = 10)

rm(p)


Most of the genes with a SFARI score are not assigned to a cluster:

518 of the SFARI genes (~73%) are not assigned to any cluster

cro(plot_data$hasSFARIScore, list(plot_data$hasCluster, total()))
 Belongs to a Module     #Total 
 FALSE   TRUE   
 Has a SFARI Score 
   FALSE  12365 2314   14679
   TRUE  518 195   713
   #Total cases  12883 2509   15392


Conclusion:


The main difference between algorithms is that Dynamic Hybrid clusters outlier genes and Dynamic Tree leaves them out, so Dynamic Tree would give me a ‘cleaner’ group of genes to work with, but at the expense of losing too many genes

I think in this case, using the Dynamic Tree modules wouldn’t be feasible given the huge ammount of unassigned genes, so I’m going to use the Dynamic Hybrid algorithm to keep more genes

clustering_selected = 'DynamicHybrid'
genes_info$Module = genes_info[,clustering_selected]


Dynamic Hybrid Modules


*The colour of the modules is the arbitrary one assigned during the WGCNA algorithm, where the gray cluster actually represents all the genes that were left without a cluster (so it’s not actually a cluster).

The Dynamic Hybrid algorithm created 179 modules and leaves 2188 genes without a module (this is still quite large)

table(genes_info$Module)
## 
## #00A4FF #00A6FF #00A8FF #00A9FF #00ABFD #00ACFB #00AEFA #00AFF8 #00B0F6 #00B2F3 
##      85      35      19      72      73      29      27     112      39      82 
## #00B3F1 #00B4EF #00B5ED #00B6EA #00B7E7 #00B81B #00B8E5 #00B928 #00B932 #00B9E2 
##      52      32      61     126      14      32      32      35      88      25 
## #00BA3B #00BA43 #00BADF #00BB4A #00BBDC #00BC50 #00BC56 #00BCD6 #00BCD9 #00BD5C 
##      10      54     154      91      26      61      26      60      91      26 
## #00BD61 #00BDD0 #00BDD3 #00BE67 #00BE6C #00BE71 #00BEC9 #00BECD #00BF76 #00BF7B 
##      60      84      14      32      95      22     166      61      60     151 
## #00BF7F #00BFC3 #00BFC6 #00C084 #00C088 #00C08C #00C091 #00C095 #00C0B1 #00C0B4 
##      43     190     188     105      28     117      44     140      59      77 
## #00C0B8 #00C0BC #00C0BF #00C199 #00C19D #00C1A1 #00C1A5 #00C1A9 #00C1AD #08B704 
##      65      16      67      59      40     139      22      53      49      33 
## #26B700 #2DA2FF #36B600 #41A0FF #41B500 #4BB400 #519FFF #54B400 #5CB300 #5D9DFF 
##      15     120      42     216      53     102      55      69      57      66 
## #63B200 #689BFF #69B100 #6FB000 #7299FF #75AF00 #7AAE00 #7B97FF #7FAE00 #8394FF 
##     223      13      13      35      20      34      54      62      18      39 
## #84AD00 #89AC00 #8B92FF #8EAB00 #92AA00 #9390FF #96A900 #9A8EFF #9AA800 #9EA700 
##      40     205      26      47      88      76      63      37      75      89 
## #A08CFF #A2A600 #A68AFF #A6A500 #A9A300 #AC87FF #ADA200 #B0A100 #B285FF #B3A000 
##      55      58      24      40      64      31      97      18      33     114 
## #B783FF #B79F00 #BA9E00 #BC81FF #BD9D00 #C09B00 #C17FFF #C39A00 #C67DFF #C69900 
##      44      74      63      23      30      25      29      48      99     150 
## #C89800 #CA7BFF #CB9600 #CE9500 #CF78FF #D09400 #D376FF #D39300 #D59100 #D674FD 
##      69      42      51     103     165      68      38      89      31      49 
## #D89000 #DA73FB #DA8F00 #DC8D00 #DD71FA #DE8C00 #E08A00 #E16FF8 #E28900 #E46DF6 
##      39      31      13      54      55      86      14      77     273     115 
## #E48800 #E68617 #E76CF3 #E88526 #E96AF1 #EA8331 #EC69EF #EC823A #EE67EC #EE8042 
##      70      62      71      41      18      15      28      37      99      58 
## #EF7F49 #F066E9 #F17D50 #F27C56 #F365E7 #F47B5C #F564E4 #F57962 #F663E1 #F77868 
##     147      60      83      44      56      70      28      30      49      26 
## #F862DE #F8766D #F962DB #F97572 #FA7377 #FB61D8 #FB727C #FC61D5 #FC7181 #FD61D1 
##      49      33     197      73     203      69      88      91      72      35 
## #FD6F86 #FE61CE #FE6E8A #FF61C4 #FF61C7 #FF61CB #FF62BC #FF62C0 #FF63B5 #FF63B8 
##     198      23     126      24     926      32     111     108      92      49 
## #FF64B1 #FF65AD #FF66A9 #FF67A5 #FF68A0 #FF699C #FF6A98 #FF6B93 #FF6D8F    gray 
##      61     143      39      90     173     225      67      73      61    2188
plot_data = table(genes_info$Module) %>% data.frame %>% arrange(desc(Freq))

ggplotly(plot_data %>% ggplot(aes(x=reorder(Var1, -Freq), y=Freq)) + geom_bar(stat='identity', fill=plot_data$Var1) + 
         ggtitle('Module size') + ylab('Number of genes') + xlab('Module') + theme_minimal() + 
         theme(axis.text.x = element_text(angle = 90)))


Relation to external clinical traits


Quantifying module-trait associations


In the WGCNA documentation they use Pearson correlation to calculate correlations, I think all of their variables were continuous. Since I have categorical variables I’m going to use the hetcor function, that calculates Pearson, polyserial or polychoric correlations depending on the type of variables involved.

  • I’m not sure how the corPvalueStudent function calculates the p-values and I cannot find any documentation…

  • Compared correlations using Pearson correlation and with hetcor and they are very similar, but a bit more extreme with hetcor. The same thing happens with the p-values.

datTraits = datMeta %>% dplyr::select(Diagnosis, Sex, Age)

# Recalculate MEs with color labels
ME_object = datExpr %>% t %>% moduleEigengenes(colors = genes_info$Module)
MEs = orderMEs(ME_object$eigengenes)

# Calculate correlation between eigengenes and the traits and their p-values
moduleTraitCor = MEs %>% apply(2, function(x) hetcor(x, datTraits)$correlations[1,-1]) %>% t
rownames(moduleTraitCor) = colnames(MEs)
colnames(moduleTraitCor) = colnames(datTraits)
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nrow(datExpr))

# Create text matrix for the Heatmap
textMatrix = paste0(signif(moduleTraitCor, 2), ' (', signif(moduleTraitPvalue, 1), ')')
dim(textMatrix) = dim(moduleTraitCor)

# In case there are any NAs
if(sum(!complete.cases(moduleTraitCor))>0){
  print(paste0(sum(is.na(moduleTraitCor)),' correlation(s) could not be calculated')) 
}


rm(ME_object)
  • In general, modules have strong correlations with Diagnosis with really small p-values and not much relation with anything else
# Sort moduleTraitCor by Diagnosis
moduleTraitCor = moduleTraitCor[order(moduleTraitCor[,1], decreasing=TRUE),]
moduleTraitPvalue = moduleTraitPvalue[order(moduleTraitCor[,1], decreasing=TRUE),]

# Create text matrix for the Heatmap
textMatrix = paste0(signif(moduleTraitCor, 2), ' (', signif(moduleTraitPvalue, 1), ')')
dim(textMatrix) = dim(moduleTraitCor)


labeledHeatmap(Matrix = moduleTraitCor, xLabels = names(datTraits), 
               yLabels =  gsub('ME','',rownames(moduleTraitCor)), 
               yColorWidth=0, colors = brewer.pal(11,'PiYG'), 
               bg.lab.y = gsub('ME','',rownames(moduleTraitCor)),
               textMatrix = textMatrix, setStdMargins = FALSE, cex.text = 0.8, 
               cex.lab.y = 0.75, zlim = c(-1,1), main = paste('Module-Trait relationships'))

diagnosis_cor = data.frame('Module' = gsub('ME','',rownames(moduleTraitCor)),
                           'MTcor' = moduleTraitCor[,'Diagnosis'],
                           'MTpval' = moduleTraitPvalue[,'Diagnosis'])

genes_info = genes_info %>% left_join(diagnosis_cor, by='Module')


rm(moduleTraitCor, moduleTraitPvalue, textMatrix, diagnosis_cor)

Modules with a high Module-Diagnosis (absolute) correlation should have a high content of differentially expressed genes:

Not enough DE genes to test this


Gene Significance and Module Membership


Gene significance: is the value between the correlation between the gene and the trait we are interested in. A positive gene significance means the gene is overexpressed and a negative value means its underexpressed. (The term ‘significance’ is not very acurate because it’s not actually measuring statistical significance, it’s just a correlation, but that’s how they call it in WGCNA…)

Module Membership is the correlation of the module’s eigengene and the expression profile of a gene. The higher the Module Membership, the more similar the gene is to the genes that constitute the module. (I won’t use this measure yet)


Note: Some correlations weren’t able to be calculated with the original function (weirdly because the correlation was too strong), so they are calculated again in the end using the polyserial function instead of hetcor, which is robust enough to calculate them

# It's more efficient to iterate the correlations one by one, otherwise it calculates correlations between the eigengenes and also between the genes, which we don't need

# Check if MM information already exists and if not, calculate it
if(file.exists(paste0('./../Data/dataset_', clustering_selected, '.csv'))){
  
  dataset = read.csv(paste0('./../Data/dataset_', clustering_selected, '.csv'))
  dataset$Module = dataset[,clustering_selected]
  
} else {
  
  ############# 1. Calculate Gene Significance
  GS_info = data.frame('ID'=rownames(datExpr),
                       'GS'=datExpr %>% apply(1, function(x) hetcor(x,datMeta$Diagnosis)$correlations[1,2])) %>%
            mutate('GSpval'=corPvalueStudent(GS, ncol(datExpr)))
  
  #############  2. Calculate Module Membership
  
  #setup parallel backend to use many processors
  cores = detectCores()
  cl = makeCluster(cores-1)
  registerDoParallel(cl)
  
  # Create matrix with MM by gene
  MM = foreach(i=1:nrow(datExpr), .combine=rbind) %dopar% {
    library(polycor)
    tempMatrix = apply(MEs, 2, function(x) hetcor(as.numeric(datExpr[i,]), x)$correlations[1,2])
    tempMatrix
  }
  
  # Stop clusters
  stopCluster(cl)
  
  rownames(MM) = rownames(datExpr)
  colnames(MM) = paste0('MM',gsub('ME','',colnames(MEs)))
  
  # Calculate p-values
  MMpval = MM %>% corPvalueStudent(ncol(datExpr)) %>% as.data.frame
  colnames(MMpval) = paste0('MMpval', gsub('ME','',colnames(MEs)))
  
  MM = MM %>% as.data.frame %>% mutate(ID = rownames(.))
  MMpval = MMpval %>% as.data.frame %>% mutate(ID = rownames(.))
  
  # Join and save results
  dataset = genes_info %>% dplyr::select(ID, `gene-score`, clustering_selected, MTcor, MTpval) %>%
            left_join(GS_info, by='ID') %>%
            left_join(MM, by='ID') %>%
            left_join(MMpval, by='ID')
  
  write.csv(dataset, file = paste0('./../Data/dataset_', clustering_selected, '.csv'), row.names = FALSE)
  
  rm(cores, cl) 
  
}

GS_missing = dataset$ID[is.na(dataset$GS)] %>% as.character

if(length(GS_missing)>0){
  
  cat(paste0(length(GS_missing),' correlations between genes and Diagnosis could not be calculated, ',
             'calculating them with the polyserial function'))
  
  for(g in GS_missing){
    dataset$GS[dataset$ID == g] = polyserial(as.numeric(datExpr[g,]), datMeta$Diagnosis)
  }
  
}
## 1 correlations between genes and Diagnosis could not be calculated, calculating them with the polyserial function
## Warning in polyserial(as.numeric(datExpr[g, ]), datMeta$Diagnosis): initial
## correlation inadmissible, -1.05918646049669, set to -0.9999
rm(GS_missing)


Analysing concordance between these metrics in the genes


1. Gene Significance vs Log Fold Change


Gene significance and Log Fold Chance are two different ways to measure the same thing, so there should be a concordance between them

Both variables agree with each other relatively well

plot_data = dataset %>% dplyr::select(ID, MTcor, GS) %>% 
            left_join(genes_info %>% dplyr::select(ID, gene.score), by='ID') %>%
            left_join(genes_info %>% dplyr::select(ID, AveExpr, log2FoldChange, significant, Module), 
                      by='ID') %>%
            left_join(data.frame(MTcor=unique(dataset$MTcor)) %>% arrange(by=MTcor) %>% 
                                 mutate(order=1:length(unique(dataset$MTcor))), by='MTcor')

ggplotly(plot_data %>% ggplot(aes(GS, log2FoldChange)) +
         geom_point(color=plot_data$Module, alpha=0.5, aes(ID=Module)) + 
         geom_smooth(color='gray', se=FALSE) + theme_minimal() + xlab('Gene Significance') + 
         ggtitle(paste0('R^2 = ', round(cor(plot_data$log2FoldChange, plot_data$GS)[1]**2, 2))))


2. Module-Diagnosis correlation vs Gene Significance


In general, modules with the highest Module-Diagnosis correlation should have genes with high Gene Significance

Note: For the Module-Diagnosis plots, if you do boxplots, you lose the exact module-diagnosis correlation and you only keep the order, so I decided to compensate this downside with a second plot, where each point is plotted individually using their module’s Module-Diagnosis correlation as the x axis. I think the boxplot plot is easier to understand but the second plot contains more information, so I don’t know which one is better.

plot_data = plot_data %>% arrange(order)

ggplotly(plot_data %>% ggplot(aes(order, GS, group=order)) + 
         geom_hline(yintercept=0, color='gray', linetype='dotted') +
         geom_boxplot(fill=unique(plot_data$Module)) + theme_minimal() + 
         xlab('Modules ordered by Module-Diagnosis correlation') + ylab('Gene Significance'))
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
plot_data %>% ggplot(aes(MTcor, GS)) + geom_hline(yintercept=0, color='gray', linetype='dotted') + 
         geom_point(color=plot_data$Module, alpha=0.1, aes(id=ID)) + geom_smooth(color='gray', se=FALSE) + 
         theme_minimal() + xlab('Module-Diagnosis correlation') + ylab('Gene Significance') + 
         ggtitle(paste0('R^2=',round(cor(plot_data$MTcor, plot_data$GS)^2,2)))


3. Module-Diagnosis correlation vs Log Fold Change


The same happens with the Log Fold Change

ggplotly(plot_data %>% ggplot(aes(order, log2FoldChange, group=order)) + geom_hline(yintercept=0, color='gray', linetype='dotted') +
         geom_boxplot(fill=unique(plot_data$Module)) + 
         theme_minimal() + xlab('Modules ordered by Module-Diagnosis correlation') + ylab('log2FoldChange'))
ggplotly(plot_data %>% ggplot(aes(MTcor, log2FoldChange)) + 
         geom_hline(yintercept=0, color='gray', linetype='dotted') +
         geom_point(color=plot_data$Module, alpha=0.1, aes(id=ID)) + geom_smooth(color='gray', se=FALSE) + 
         theme_minimal() + xlab('Module-Diagnosis correlation') + ylab('log2FoldChange') + 
         ggtitle(paste0('R^2=',round(cor(plot_data$MTcor, plot_data$log2FoldChange)^2,2))))


4. Module-Diagnosis vs Mean Expression


In theory, there shouldn’t be a relation between module-diagnosis and mean expression, but in the the exploratory analysis, we saw that the overexpressed genes tended to have lower levels of expression than the overexpressed genes, and this pattern can be seen in these plots where the modules with negative Module-Diagonsis correlation have slightly higher levels of expression than the modules with positive Module-Diagnosis correlation, although this pattern is note very strong and all modules have similar levels of expression.

ggplotly(plot_data %>% ggplot(aes(order, AveExpr, group=order)) + 
         geom_hline(yintercept=mean(plot_data$AveExpr), color='gray', linetype='dotted') +
         geom_boxplot(fill=unique(plot_data$Module)) + theme_minimal() + 
         xlab('Modules ordered by Module-Diagnosis correlation') + ylab('Mean Expression'))
plot_data %>% ggplot(aes(MTcor, AveExpr)) + geom_point(alpha=.2, color=plot_data$Module, aes(id=ID)) + 
         geom_hline(yintercept=mean(plot_data$AveExpr), color='gray', linetype='dotted') + 
         geom_smooth(color='gray', alpha=0.3) + theme_minimal() + 
         xlab('Module-Diagnosis correlation') + ylab('Mean Expression') +
         ggtitle(paste0('R^2=',round(cor(plot_data$MTcor, plot_data$AveExpr)^2,2)))


Conclusion:


All of the variables seem to agree with each other, Modules with a high correlation with Diagnosis tend to have genes with high values of Log Fold Change as well as high values of Gene Significance, and the gray module, which groups all the genes that weren’t assigned to any cluster tends to have a very poor performance in all of the metrics.






SFARI Scores


Since SFARI scores genes depending on the strength of the evidence linking it to the development of autism, in theory, there should be some concordance between the metrics we have been studying above and these scores…

SFARI Scores vs Gene Significance


  • The higher the SFARI Gene Score, the lower the Gene Significance, although this pattern is quite subtle

  • All SFARI scores have distributions similar to the genes with Neuronal annotations

  • These differences are generally not statistically significant

comparisons = list(c('1','2'), c('2','3'), c('3','Neuronal'), c('Neuronal','Others'),
                   c('1','3'), c('3','Others'), c('2','Neuronal'),
                   c('1','Neuronal'), c('2','Others'), c('1','Others'))
increase = 0.05
base = 0.75
pos_y_comparisons = c(rep(base, 4), rep(base + increase, 2), base + 2:5*increase)

plot_data %>% ggplot(aes(gene.score, abs(GS), fill=gene.score)) + 
              geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3) + 
              stat_compare_means(comparisons = comparisons, label = 'p.signif', method = 't.test', 
                                 method.args = list(var.equal = FALSE), label.y = pos_y_comparisons, 
                                 tip.length = .01) +
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + theme_minimal() + 
              ylab('Gene Significance Magnitude') + xlab('SFARI Scores') + theme(legend.position='none')

rm(increase, base, pos_y_comparisons)

SFARI Scores vs Module-Diagnosis correlation


  • SFARI Score 3 has the highest Module-Diagnosis distribution of all groups, but not by much

  • There differences are usually not statistically significant

Note: Filtering out gray module because otherwise too many genes share the same Module-Diagnosis correlation

increase = 0.05
base = 0.95
pos_y_comparisons = c(rep(base, 4), rep(base + increase, 2), base + 2:5*increase)

plot_data %>% filter(Module != 'gray') %>% ggplot(aes(gene.score, abs(MTcor), fill=gene.score)) + 
              geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3) + 
              stat_compare_means(comparisons = comparisons, label = 'p.signif', method = 't.test', 
                                 method.args = list(var.equal = FALSE), label.y = pos_y_comparisons, 
                                 tip.length = .01) +
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:3,8,7))) + theme_minimal() + 
              ylab('Module-Diagnosis Correlation Magnitude') + xlab('SFARI Scores') + theme(legend.position='none')

rm(increase, base, pos_y_comparisons)

SFARI Enrichment vs Module-Diagnosis correlation


To study the strength of the relationship between a module and the biological signal found in the data related to ASD we have the Module-Diagnosis metric. We want to do a similar thing to calculate the ‘strength’ of the presence of SFARI Genes in each module to be able to identify the modules ‘enriched’ in SFARI Genes

The easiest metric would be to calculate the percentage of genes in each module that belong to the SFARI Genes, but this metric doesn’t take into account the size of the model and can favour more extreme but less robust results (small modules)

Instead, we are going to use the Over Representation Analysis (ORA) approach, that uses the hypergeometric distribution to calculate the probability of a module of size \(n\) containing at least \(k\) SFARI Genes and we are going to interpret this probability as the ‘enrichment’ of SFARI Genes in each module

Hypergeometric distribution explanation:


If we interpret the number of genes (\(n\)) in a cluster as \(n\) random draws without replacement from a finite population of size \(N\), and the number of SFARI genes in the cluster (\(s\)) as \(s\) successes in those \(n\) draws, where we know that \(N\) contains exactly \(S\) successes, then we can use the hypergeometric distribution to calculate the statistical significance of having drawn \(s\) successes out of \(n\) draws, and use this value to select the clusters with the highest statistical significance

Percentage of SFARI Genes vs ORA by Module


# Calculate % and ORA of SFARI Genes in each module

modules = unique(genes_info$Module[genes_info$Module!='gray']) %>% as.character

# We need the entrez ID of the genes for this
getinfo = c('ensembl_gene_id','entrezgene')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl', 
               host='feb2014.archive.ensembl.org')
biomart_output = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), 
                       values=genes_info$ID[genes_info$Module!='gray'], mart=mart) %>%
                 left_join(genes_info %>% dplyr::select(ID, Module,`gene-score`), by = c('ensembl_gene_id'='ID'))

# We need to build a term2gene dataframe with the genes and their SFARI Scores
term2gene = biomart_output %>% mutate(term = ifelse(`gene-score` == 'Others', 'Others', 'SFARI'), 
                                      'gene' = entrezgene) %>%  dplyr::select(term, gene) %>% distinct


enrichment_data = data.frame('Module' = modules, 'size' = 0, 'perc_SFARI' = 0,
                             'pval_ORA' = 0, 'padj_ORA' = 0)

for(i in 1:length(modules)){
  module = modules[i]
  genes_in_module = biomart_output$entrezgene[biomart_output$Module==module]
  ORA_module = enricher(gene = genes_in_module, universe = biomart_output$entrezgene %>% as.character, 
                        pAdjustMethod = 'bonferroni', TERM2GENE = term2gene, 
                        pvalueCutoff = 1, qvalueCutoff = 1, maxGSSize = 50000) %>% 
                        data.frame %>% dplyr::select(-geneID,-Description)
  ORA_pval = ifelse('SFARI' %in% ORA_module$ID, ORA_module$pvalue[ORA_module$ID=='SFARI'], 1)
  ORA_padj = ifelse('SFARI' %in% ORA_module$ID, ORA_module$p.adjust[ORA_module$ID=='SFARI'], 1)
  enrichment_data[i,-1] = c(length(genes_in_module), 
                            mean(genes_info$`gene-score`[genes_info$Module==module]!='Others'), 
                            ORA_pval, ORA_padj)
}

enrichment_data = enrichment_data %>% 
                  left_join(genes_info %>% dplyr::select(Module, MTcor) %>% unique, by = 'Module')

rm(i, module, genes_in_module, ORA_module, ORA_pval, ORA_padj, getinfo, mart, term2gene)

Note: in ORA, a lower probability indicates a higher enrichment in SFARI Genes, so I’m using 1-probability to measure the enrichment of each module

The two metrics agree quite well, but if we focus on the modules with the highest ‘enrichment’ with both metrics we can see that the smallest modules are below the trend line while the larger ones are above, which shows the bias towards small modules the percentage of SFARI Genes metric has, making the ORA Enrichment metric more desirable

ggplotly(enrichment_data %>% ggplot(aes(perc_SFARI, 1-pval_ORA, size=size)) + 
         geom_point(color = enrichment_data$Module, alpha = 0.5, aes(id=Module)) + 
         geom_smooth(color='gray', se = FALSE) + 
         xlab('% of SFARI Genes') + ylab('ORA Enrichment') + ggtitle('Modules') + theme_minimal())

SFARI Enrichment vs Module-Diagnosis Correlation


There doesn’t seem to be a strong relation between these two metrics, perhaps the modules with the largest Module-Diagnosis correlations have lower Enrichment in SFARI Genes than the rest of the dataset


Cor(SFARI Enrichment, Module-Diagnosis) = 0.13

  • Correlation for modules with a positive Module-Diagnosis correlation = -0.05

  • Correlation for modules with a negative Module-Diagnosis correlation = 0.15

ggplotly(enrichment_data %>% ggplot(aes(MTcor, 1-pval_ORA, size=size)) + 
         geom_point(color=enrichment_data$Module, alpha=0.5, aes(id=Module)) + 
         geom_smooth(color='#cccccc', size = 0.5, alpha=0.1) + xlab('Module-Diagnosis Correlation') + 
         ggtitle('SFARI Enrichment vs Module-Diagnosis Correlation') +
         ylab('Enrichment in SFARI Genes') + theme_minimal() + theme(legend.position = 'none'))
# Save for comparison with old SFARI Genes
enrichment_data_new_SFARI = enrichment_data

Dividing the SFARI Genes by score we get the same pattern, just noisier because the groups are smaller now

# We need to build a term2gene dataframe with the genes and their SFARI Scores
term2gene = biomart_output %>%
            mutate('term' = ifelse(`gene-score` == 'Others', 'Others', `gene-score`), 
                   'gene' = entrezgene) %>% dplyr::select(term, gene) %>% distinct


enrichment_data_by_score = data.frame('Module' = as.character(), 'size' = as.numeric(), 
                                      'SFARI_score' = as.character(), 'pval_ORA' = as.numeric(), 
                                      'padj_ORA' = as.numeric())

SFARI_scores = SFARI_genes$`gene-score` %>% table %>% names

for(i in 1:length(modules)){
  
  module = modules[i]
  genes_in_module = biomart_output$entrezgene[biomart_output$Module==module]
  ORA_module = enricher(gene = genes_in_module, universe = biomart_output$entrezgene %>% as.character, 
                        pAdjustMethod = 'bonferroni', TERM2GENE = term2gene, 
                        pvalueCutoff = 1, qvalueCutoff = 1, maxGSSize = 1e3)
  
  if(length(ORA_module)>0){
    ORA_module = ORA_module %>% data.frame %>% dplyr::select(ID, pvalue, p.adjust) %>%
                 add_column(.before = 'ID', Module = module) %>% 
                 add_column(.before = 'ID', size = length(genes_in_module)) %>%
                 dplyr::rename('SFARI_score' = ID, 'pval_ORA' = pvalue, 'padj_ORA' = p.adjust)
    
    missing_scores = SFARI_scores[!SFARI_scores %in% ORA_module$SFARI_score]
  } else{
    missing_scores = SFARI_scores
  }
  
  if(length(missing_scores)>0){
    for(ms in missing_scores) ORA_module = rbind(ORA_module, c(module, length(genes_in_module), ms, 1, 1))
  }
  
  colnames(ORA_module) = c('Module','size','SFARI_score','pval_ORA','padj_ORA')
  
  enrichment_data_by_score = rbind(enrichment_data_by_score, ORA_module)
}


enrichment_data_by_score = enrichment_data_by_score %>% 
                           left_join(genes_info %>% dplyr::select(Module, MTcor) %>% unique, by = 'Module')


enrichment_data_by_score = enrichment_data_by_score %>% 
                  mutate(color = SFARI_colour_hue(r = 1:3)[SFARI_score %>% as.numeric])

ggplotly(enrichment_data_by_score %>% 
         ggplot(aes(MTcor, 1-as.numeric(pval_ORA), size=size, color = color, fill = color)) + 
         geom_point(alpha=0.5, aes(id=Module)) + geom_smooth(size = 0.5, alpha = 0.1) +
         xlab('Module-Diagnosis Correlation') + ylab('Enrichment') + 
         scale_colour_manual(values = SFARI_colour_hue(r=1:3)) +
         scale_fill_manual(values = SFARI_colour_hue(r=1:3)) + 
         ggtitle('Enrichment by SFARI Gene Score') + 
         theme_minimal() + theme(legend.position = 'none'))
rm(i, module, genes_in_module, ORA_module, term2gene, SFARI_scores, missing_scores, ms, enrichment_data_by_score)



Old SFARI Scores


# SFARI Genes
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_08-29-2019_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]

plot_data = plot_data %>% dplyr::select(-gene.score) %>% 
            left_join(SFARI_genes %>% dplyr::select(ID, `gene-score`), by = 'ID') %>%
            mutate(gene.score = ifelse(!is.na(`gene-score`), `gene-score`, 
                                       ifelse(ID %in% GO_neuronal$ID, 'Neuronal', 'Others')))

SFARI Scores vs Gene Significance


  • There is no clear pattern here, not a single pairwise comparison is statistically significant
comparisons = list(c('1','2'), c('2','3'), c('3','4'), c('4','5'), c('5','6'), c('6','Neuronal'), c('Neuronal','Others'),
                   c('1','3'), c('3','5'), c('5','Neuronal'), 
                   c('2','4'), c('4','6'), c('6','Others'),
                   c('1','4'), c('4','Neuronal'),
                   c('2','5'), c('5','Others'),
                   c('3','Neuronal'), c('1','5'), c('2','6'), c('3','Neuronal'), c('4','Others'))
increase = 0.04
base = 0.75
pos_y_comparisons = c(rep(base, 7), rep(base + increase,3), rep(base + 2*increase,3), rep(base + 3*increase, 2),
                      rep(base + 4*increase, 2), base + 5:9*increase)

plot_data %>% ggplot(aes(gene.score, abs(GS), fill=gene.score)) + 
              geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3) + 
              stat_compare_means(comparisons = comparisons, label = 'p.signif', method = 't.test', 
                                 method.args = list(var.equal = FALSE), label.y = pos_y_comparisons, 
                                 tip.length = .01) +
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() + 
              ylab('Gene Significance Magnitude') + xlab('SFARI Scores') + theme(legend.position='none')

rm(increase, base, pos_y_comparisons)

SFARI Scores vs Module-Diagnosis correlation


  • Most differences are not statistically significant
increase = 0.06
base = .95
pos_y_comparisons = c(rep(base, 7), rep(base + increase,3), rep(base + 2*increase,3), rep(base + 3*increase, 2),
                      rep(base + 4*increase, 2), base + 5:9*increase)

plot_data %>% filter(Module != 'gray') %>% ggplot(aes(gene.score, abs(MTcor), fill=gene.score)) + 
              geom_boxplot(outlier.colour='#cccccc', outlier.shape='o', outlier.size=3) + 
              stat_compare_means(comparisons = comparisons, label = 'p.signif', method = 't.test', 
                                 method.args = list(var.equal = FALSE), label.y = pos_y_comparisons, 
                                 tip.length = .01) +
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() + 
              ylab('Module-Diagnosis Correlation Magnitude') + xlab('SFARI Scores') + theme(legend.position='none')

rm(increase, base, pos_y_comparisons)

SFARI Enrichment vs Module-Diagnosis correlation


Percentage of SFARI Genes vs ORA by Module


# Calculate % and ORA of SFARI Genes in each module

# update biomart_output SFARI labels
biomart_output = biomart_output %>% dplyr::select(-`gene-score`) %>%
                 left_join(plot_data %>% dplyr::select(ID, `gene-score`), by = c('ensembl_gene_id'='ID'))

# We need to build a term2gene dataframe with the genes and their SFARI Scores
term2gene = biomart_output %>% mutate(term = ifelse(is.na(`gene-score`), 'Others', 'SFARI'), 
                                      'gene' = entrezgene) %>%  dplyr::select(term, gene) %>% distinct


enrichment_data = data.frame('Module' = modules, 'size' = 0, 'perc_SFARI' = 0,
                       'pval_ORA' = 0, 'padj_ORA' = 0)

for(i in 1:length(modules)){
  module = modules[i]
  genes_in_module = biomart_output$entrezgene[biomart_output$Module==module]
  ORA_module = enricher(gene = genes_in_module, universe = biomart_output$entrezgene %>% as.character, 
                        pAdjustMethod = 'bonferroni', TERM2GENE = term2gene, 
                        pvalueCutoff = 1, qvalueCutoff = 1, maxGSSize = 50000) %>% 
                        data.frame %>% dplyr::select(-geneID,-Description)
  ORA_pval = ifelse('SFARI' %in% ORA_module$ID, ORA_module$pvalue[ORA_module$ID=='SFARI'], 1)
  ORA_padj = ifelse('SFARI' %in% ORA_module$ID, ORA_module$p.adjust[ORA_module$ID=='SFARI'], 1)
  enrichment_data[i,-1] = c(length(genes_in_module), 
                          mean(!is.na(plot_data$`gene-score`[plot_data$Module==module])), 
                          ORA_pval, ORA_padj)
}

enrichment_data = enrichment_data %>% 
                  left_join(genes_info %>% dplyr::select(Module, MTcor) %>% unique, by = 'Module')

rm(i, module, genes_in_module, ORA_module, ORA_pval, ORA_padj, getinfo, mart, term2gene)

Note: in ORA, a lower probability indicates a higher enrichment in SFARI Genes, so I’m using 1-probability to measure the enrichment of each module

The two metrics agree quite well, but if we focus on the modules with the highest ‘enrichment’ with both metrics we can see that the smallest modules are below the trend line while the larger ones are above, which shows the bias towards small modules the percentage of SFARI Genes metric has, making the ORA Enrichment metric more desirable

ggplotly(enrichment_data %>% ggplot(aes(perc_SFARI, 1-pval_ORA, size=size)) + 
         geom_point(color = enrichment_data$Module, alpha = 0.5, aes(id=Module)) + 
         geom_smooth(color='gray', se = FALSE) + 
         xlab('% of SFARI Genes') + ylab('ORA Enrichment') + ggtitle('Modules') + theme_minimal())

SFARI Enrichment vs Module-Diagnosis Correlation


There is a stronger relation between these two metrics than with the new SFARI Genes:

  • Mdules with a high negative Module-Diagnosis correlation are the ones with the lowest trend of Enrichment on SFARI Genes(!)

  • Modules with a Module-Diagnosis correlation close to zero are the ones with the highest Enrichment in SFARI Genes


Cor(SFARI Enrichment, Module-Diagnosis) = 0.11

  • Correlation for modules with a positive Module-Diagnosis correlation = -0.07

  • Correlation for modules with a negative Module-Diagnosis correlation = 0.18

ggplotly(enrichment_data %>% ggplot(aes(MTcor, 1-pval_ORA, size=size)) + 
         geom_point(color=enrichment_data$Module, alpha=0.5, aes(id=Module)) + 
         geom_smooth(color='#cccccc', size = 0.5, alpha=0.1) + xlab('Module-Diagnosis Correlation') + 
         ggtitle('SFARI Enrichment vs Module-Diagnosis Correlation') +
         ylab('Enrichment in SFARI Genes') + theme_minimal() + theme(legend.position = 'none'))

Dividing the SFARI Genes by score we get the same pattern, just much noisier because the groups are smaller now. SFARI Score 1 could be enriched in genes with high (positive) Module-Diagnosis correlation, but this pattern also could be just because there are very few genes with a SFARI Score 1, making the results unreliable

# We need to build a term2gene dataframe with the genes and their SFARI Scores
term2gene = biomart_output %>%
            mutate('term' = ifelse(`gene-score` == 'Others', 'Others', `gene-score`), 
                   'gene' = entrezgene) %>% dplyr::select(term, gene) %>% distinct


enrichment_data_by_score = data.frame('Module' = as.character(), 'size' = as.numeric(), 
                                      'SFARI_score' = as.character(), 'pval_ORA' = as.numeric(), 
                                      'padj_ORA' = as.numeric())

SFARI_scores = SFARI_genes$`gene-score` %>% table %>% names

for(i in 1:length(modules)){
  
  module = modules[i]
  genes_in_module = biomart_output$entrezgene[biomart_output$Module==module]
  ORA_module = enricher(gene = genes_in_module, universe = biomart_output$entrezgene %>% as.character, 
                        pAdjustMethod = 'bonferroni', TERM2GENE = term2gene, 
                        pvalueCutoff = 1, qvalueCutoff = 1, maxGSSize = 1e3)
  
  if(length(ORA_module)>0){
    ORA_module = ORA_module %>% data.frame %>% dplyr::select(ID, pvalue, p.adjust) %>%
                 add_column(.before = 'ID', Module = module) %>% 
                 add_column(.before = 'ID', size = length(genes_in_module)) %>%
                 dplyr::rename('SFARI_score' = ID, 'pval_ORA' = pvalue, 'padj_ORA' = p.adjust)
    
    missing_scores = SFARI_scores[!SFARI_scores %in% ORA_module$SFARI_score]
  } else{
    missing_scores = SFARI_scores
  }
  
  if(length(missing_scores)>0){
    for(ms in missing_scores) ORA_module = rbind(ORA_module, c(module, length(genes_in_module), ms, 1, 1))
  }
  
  colnames(ORA_module) = c('Module','size','SFARI_score','pval_ORA','padj_ORA')
  
  enrichment_data_by_score = rbind(enrichment_data_by_score, ORA_module)
}


enrichment_data_by_score = enrichment_data_by_score %>% 
                           left_join(genes_info %>% dplyr::select(Module, MTcor) %>% unique, by = 'Module')


enrichment_data_by_score = enrichment_data_by_score %>% 
                  mutate(color = SFARI_colour_hue(r = 1:6)[SFARI_score %>% as.numeric])

ggplotly(enrichment_data_by_score %>% 
         ggplot(aes(MTcor, 1-as.numeric(pval_ORA), size=size, color = color, fill = color)) + 
         geom_point(alpha=0.3, aes(id=Module)) + geom_smooth(size = 0.5, alpha = 0.05) +
         xlab('Module-Diagnosis Correlation') + ylab('Enrichment') + 
         scale_colour_manual(values = SFARI_colour_hue(r=1:6)) +
         scale_fill_manual(values = SFARI_colour_hue(r=1:6)) + 
         ggtitle('Enrichment by SFARI Gene Score') + 
         theme_minimal() + theme(legend.position = 'none'))
rm(i, module, genes_in_module, ORA_module, term2gene, SFARI_scores, missing_scores, ms, enrichment_data_by_score)



Comparison between Old and New SFARI Genes


  • With both SFARI Gene lists, there seems to be a megative correlation between the magnitude of the Mdoule-Diagnosis correlation and the SFARI Enrichment by module

  • The negative enrichment in modules with high negative Module-Diagnosis correlation seems to be a bit stronger with the old SFARI Genes scoring criteria, but they are quite similar

compare_plot = enrichment_data %>% dplyr::mutate('Old_SFARI' = pval_ORA) %>% 
               dplyr::select(Module, size, MTcor, Old_SFARI) %>%
               left_join(enrichment_data_new_SFARI %>% dplyr::mutate('New_SFARI' = pval_ORA) %>%
                         dplyr::select(Module, New_SFARI), by = 'Module') %>% 
               melt(id.vars = c('Module','size','MTcor')) %>% dplyr::rename('SFARI_version' = variable) %>%
               mutate(size = sqrt(100*as.numeric(size)/max(as.numeric(size))))

p = compare_plot %>% ggplot(aes(MTcor, 1-as.numeric(value), color = SFARI_version, fill = SFARI_version)) +
    geom_point(alpha = 0.5, size=compare_plot$size, aes(id=Module)) + geom_smooth(alpha = 0.1) + 
    xlab('Module-Diagnosis Correlation') + ylab('Enrichment in SFARI Genes') + 
    ggtitle('SFARI Enrichment vs Module-Diagnosis Correlation') +
    theme_minimal() + theme(legend.position = 'bottom')
          
ggExtra::ggMarginal(p, type = 'density', margins = 'y', groupColour = TRUE, groupFill = TRUE, size=10)

rm(p, compare_plot)



Conclusion:


  • SFARI genes not consistent with the other measurements, they even sometimes seem to contradict them, although the strength of the relations is usually not strong enough to be statistically significant

  • There is a negative relation between SFARI Enrichment and Module-Diagnosis correlation magnitude, being the modules with negative Module-Diagnosis correlation the most affected

  • These patterns are visible in both old and new SFARI Scoring systems, but they are perhaps a bit clearer with the old SFARI scoring system



Session info

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] doParallel_1.0.15      iterators_1.0.12       foreach_1.5.0         
##  [4] clusterProfiler_3.12.0 biomaRt_2.40.5         polycor_0.7-10        
##  [7] expss_0.10.2           WGCNA_1.69             fastcluster_1.1.25    
## [10] dynamicTreeCut_1.63-1  ggExtra_0.9            ggpubr_0.2.5          
## [13] magrittr_1.5           GGally_1.5.0           gridExtra_2.3         
## [16] viridis_0.5.1          viridisLite_0.3.0      RColorBrewer_1.1-2    
## [19] dendextend_1.13.4      plotly_4.9.2           glue_1.4.1            
## [22] reshape2_1.4.4         forcats_0.5.0          stringr_1.4.0         
## [25] dplyr_1.0.0            purrr_0.3.4            readr_1.3.1           
## [28] tidyr_1.1.0            tibble_3.0.1           ggplot2_3.3.2         
## [31] tidyverse_1.3.0       
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1          backports_1.1.8       fastmatch_1.1-0      
##   [4] Hmisc_4.4-0           igraph_1.2.5          plyr_1.8.6           
##   [7] lazyeval_0.2.2        splines_3.6.3         crosstalk_1.1.0.1    
##  [10] BiocParallel_1.18.1   urltools_1.7.3        digest_0.6.25        
##  [13] GOSemSim_2.10.0       htmltools_0.4.0       GO.db_3.8.2          
##  [16] fansi_0.4.1           checkmate_2.0.0       memoise_1.1.0        
##  [19] cluster_2.1.0         graphlayouts_0.7.0    modelr_0.1.6         
##  [22] matrixStats_0.56.0    enrichplot_1.4.0      prettyunits_1.1.1    
##  [25] jpeg_0.1-8.1          colorspace_1.4-1      ggrepel_0.8.2        
##  [28] blob_1.2.1            rvest_0.3.5           haven_2.2.0          
##  [31] xfun_0.12             crayon_1.3.4          RCurl_1.98-1.2       
##  [34] jsonlite_1.7.0        impute_1.58.0         survival_3.1-12      
##  [37] polyclip_1.10-0       gtable_0.3.0          UpSetR_1.4.0         
##  [40] BiocGenerics_0.30.0   scales_1.1.1          DOSE_3.10.2          
##  [43] mvtnorm_1.1-0         DBI_1.1.0             miniUI_0.1.1.1       
##  [46] Rcpp_1.0.4.6          xtable_1.8-4          progress_1.2.2       
##  [49] htmlTable_1.13.3      gridGraphics_0.5-0    europepmc_0.4        
##  [52] foreign_0.8-76        bit_1.1-15.2          preprocessCore_1.46.0
##  [55] Formula_1.2-3         stats4_3.6.3          htmlwidgets_1.5.1    
##  [58] httr_1.4.1            fgsea_1.10.1          acepack_1.4.1        
##  [61] ellipsis_0.3.1        farver_2.0.3          pkgconfig_2.0.3      
##  [64] reshape_0.8.8         XML_3.99-0.3          nnet_7.3-14          
##  [67] dbplyr_1.4.2          labeling_0.3          ggplotify_0.0.5      
##  [70] tidyselect_1.1.0      rlang_0.4.6           later_1.0.0          
##  [73] AnnotationDbi_1.46.1  munsell_0.5.0         cellranger_1.1.0     
##  [76] tools_3.6.3           cli_2.0.2             generics_0.0.2       
##  [79] RSQLite_2.2.0         ggridges_0.5.2        broom_0.5.5          
##  [82] evaluate_0.14         fastmap_1.0.1         yaml_2.2.1           
##  [85] knitr_1.28            bit64_0.9-7           fs_1.4.0             
##  [88] tidygraph_1.2.0       ggraph_2.0.3          nlme_3.1-147         
##  [91] mime_0.9              DO.db_2.9             xml2_1.2.5           
##  [94] compiler_3.6.3        rstudioapi_0.11       curl_4.3             
##  [97] png_0.1-7             ggsignif_0.6.0        reprex_0.3.0         
## [100] tweenr_1.0.1          stringi_1.4.6         lattice_0.20-41      
## [103] Matrix_1.2-18         vctrs_0.3.1           pillar_1.4.4         
## [106] lifecycle_0.2.0       BiocManager_1.30.10   triebeard_0.3.0      
## [109] cowplot_1.0.0         data.table_1.12.8     bitops_1.0-6         
## [112] qvalue_2.16.0         httpuv_1.5.2          R6_2.4.1             
## [115] latticeExtra_0.6-29   promises_1.1.0        IRanges_2.18.3       
## [118] codetools_0.2-16      MASS_7.3-51.6         assertthat_0.2.1     
## [121] withr_2.2.0           S4Vectors_0.22.1      mgcv_1.8-31          
## [124] hms_0.5.3             grid_3.6.3            rpart_4.1-15         
## [127] rvcheck_0.1.8         rmarkdown_2.1         ggforce_0.3.1        
## [130] Biobase_2.44.0        shiny_1.4.0.2         lubridate_1.7.4      
## [133] base64enc_0.1-3